Importing Packages

In [3]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import json
import requests
import time
import sys 
from math import log
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode,plot,iplot
import plotly.figure_factory as ff
import chart_studio.tools as tls
import chart_studio.plotly as py

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

Acquiring Personal Spotify Data

  • Visit https://www.spotify.com/us/account/privacy/ -- log into Spotify account and scroll to the bottom and request data.
  • Receive a downloadable zip file with listening data from Spotify's team in around 1-3 days.
  • Move MyData Folder to desired directory.

Load & extract streaming data from JSON

In [4]:
with open('MyData/StreamingHistory0.json', encoding='utf8') as f:
    jsondata1 = json.load(f)
    
with open('MyData/StreamingHistory1.json', encoding='utf8') as f:
    jsondata2 = json.load(f)
    
jsondata = jsondata1 + jsondata2

Extract data for each listening instance

In [5]:
my_streaming = pd.DataFrame()

def extract_json_value(column_name,data):
    return [i[column_name] for i in data]

# For each streaming instance:

# Track Name
# Artist Name
# Timestamp of End Time listening to that track stream
# Milliseconds listened in that instance

my_streaming['track_name'] = extract_json_value('trackName',jsondata)
my_streaming['artist_name'] = extract_json_value('artistName',jsondata)
my_streaming['end_time'] = extract_json_value('endTime',jsondata)
my_streaming['ms_played'] = extract_json_value('msPlayed',jsondata)

Accessing the Spotify API

Create an App

1) Go to https://developer.spotify.com/dashboard/applications

2) Create an App

3) Name your App

4) Go to App --> Find Client_ID & Client Secret

Retrieve token using account details

In [6]:
username = 'shahv1057'

# Copy-paste previously found client details
client_id ='d91dbe3fe689448aa5203a4b639c2a2f' 
client_secret = 'a9fbd58047704e55a4abf4469fd66e2e'

# This can be any localhost site
redirect_uri = 'http://localhost:1234/callback'

scope = 'user-read-recently-played'

# Running this cell will open a prompt at 'redirect_uri', click 'agree' to authorize and connect to API
token = util.prompt_for_user_token(username=username, 
                                   scope=scope, 
                                   client_id=client_id,   
                                   client_secret=client_secret,     
                                   redirect_uri=redirect_uri)

Get individual track IDs every track in Dataframe for Spotify API

In [7]:
def get_id(track_name,artist, token):
    '''
    Input: Track Name, Artist Name, and API token
    Output: Spotify's unique Track ID for that track
    '''
    
    headers = {
    'Accept': 'application/json',
    'Content-Type': 'application/json',
    'Authorization': f'Bearer ' + token,
    }
    
    trackandartist = track_name+ " " + artist
    
    params = [
    #q is the search query parameter
    ('q',trackandartist ),
    ('type', 'track'),
    ]
    
    try:
        response = requests.get('https://api.spotify.com/v1/search', 
                    headers = headers, params = params, timeout = 10)
        json = response.json()
        track_id = json['tracks']['items'][0]['id']
        return track_id
    
    except:
        return None
In [8]:
# This may take several minutes, up to close to an hour mattering on your listening history
my_streaming["track_id"] = my_streaming.apply(lambda x: get_id(x["track_name"],x["artist_name"],token),axis=1)

Acquire Spotify's audio feature data for all tracks in my_streaming Dataframe

In [ ]:
trackid = list(my_streaming["track_id"].dropna().unique())

my_features = pd.DataFrame(columns=[
    "track_id","energy","tempo","speechiness",
    "acousticness","instrumentalness","danceability",
    "loudness","valence"
    ])

# Authorize access to audio features 
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, 
                                                      client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager,requests_timeout=100)

# For each Track ID in my Spotify-provided listening history...
# import spotify's audio features into my_features DataFrame

for track in trackid:
    print (track)
    features = sp.audio_features(tracks = [track])[0]
    if features is not None:
        my_features = my_features.append({
            "track_id":track,
            "energy":features['energy'], 
            "tempo":features['tempo'],
            "speechiness":features['speechiness'],
            "acousticness":features['acousticness'],
            "instrumentalness":features['instrumentalness'],
            "danceability":features['danceability'],
            "loudness":features['loudness'],
            "valence":features['valence'],
            },ignore_index=True)

Merge track and artist names to features dataframe

In [37]:
my_features = my_features.merge(my_streaming[['track_id','track_name','artist_name']],how="left",left_on= "track_id", right_on="track_id").drop_duplicates()

Add Album names

In [17]:
def acquire_album(track_id):
    return sp.track(track_id)['album']['name']

my_features['album'] = my_features['track_id'].apply(acquire_album)
my_streaming = my_streaming.merge(my_features[['track_id','album']],how="left",left_on= "track_id", right_on="track_id").drop_duplicates()

Pickle Dataframes

In [22]:
my_streaming.to_pickle('my_streaming.pkl')
my_features.to_pickle('my_features.pkl')

Interactive Data Analysis with Plotly

Top Songs

In [26]:
songs = my_streaming.copy()
songs['month_year'] = pd.to_datetime(songs['end_time']).dt.to_period('M')
songs['ms_played'] = songs['ms_played'] / 60000
top20songs = songs.groupby('track_name')['ms_played'].sum().sort_values(ascending=False).nlargest(20)
songs = songs[songs['track_name'].isin(top20songs.index)]

plotly_songs_df = songs.groupby(['track_name','artist_name','month_year'])['ms_played'].sum().reset_index()
plotly_songs_df['month_year'] = plotly_songs_df['month_year'].astype(str)
plotly_songs_df['ms_played'] = plotly_songs_df['ms_played'].round()

months_order = ['2019-06', '2019-07', '2019-08','2019-09','2019-10','2019-11','2019-12',
                '2020-01', '2020-02', '2020-03', '2020-04','2020-05', '2020-06']

colors = ["Black","#240011","#480020","#6D002E","#91003A","#B30046","#D10550","#EC0E5B","#E3416A","#DE7082","#DE999E","#E3BFBE","#ECDFDE"][::-1]
labels={"month_year": "Month",  "track_name": "Song", "ms_played": "Minutes Listening",'artist_name': 'Artist'}
fig = px.bar(plotly_songs_df, 
             x='track_name',
             y='ms_played',
             hover_data=['track_name','artist_name','month_year','ms_played'],
             opacity=.8,
             title = '(Last 12 Months)',
             color='month_year',
             labels=labels,
             category_orders={"month_year": months_order},
             color_discrete_sequence=colors
            )
fig.update_traces(marker_line_width=0,marker_line_color='black')
fig.show()

Top Artists

In [27]:
artists = my_streaming.copy()
artists['month_year'] = pd.to_datetime(artists['end_time']).dt.to_period('M')
artists['ms_played'] = artists['ms_played'] / 60000
top20artists = artists.groupby('artist_name')['ms_played'].sum().sort_values(ascending=False).nlargest(20)
artists = artists[artists['artist_name'].isin(top20artists.index)]

plotly_artists_df = artists.groupby(['track_name','artist_name','month_year'])['ms_played'].sum().reset_index()
plotly_artists_df['month_year'] = plotly_artists_df['month_year'].astype(str)
plotly_artists_df['ms_played'] = plotly_artists_df['ms_played'].round()

months_order = ['2019-06', '2019-07', '2019-08','2019-09','2019-10','2019-11','2019-12',
                '2020-01', '2020-02', '2020-03', '2020-04','2020-05', '2020-06']
colors = ['rgb(237,248,251)','rgb(237,248,251)','rgb(204,236,230)','rgb(204,236,230)',
          'rgb(153,216,201)','rgb(153,216,201)','rgb(102,194,164)','rgb(102,194,164)',
          'rgb(44,162,95)','rgb(44,162,95)','rgb(0,109,44)','rgb(0,109,44)','rgb(0,85,23)']

labels={"month_year": "Month",  "track_name": "Song", "ms_played": "Minutes Listening",'artist_name': 'Artist'}

fig = px.bar(plotly_artists_df, 
             x='artist_name',
             y='ms_played',
             hover_data=['track_name','artist_name','month_year','ms_played'],
             opacity=.8,
             title = '(Last 12 Months)',
             color='month_year',
             labels=labels,
             category_orders={"month_year": months_order},
             color_discrete_sequence=colors
            )
fig.update_traces(marker_line_width=0,marker_line_color='black')
fig.show()

Top Albums

In [30]:
albums = my_streaming.copy()
albums['month_year'] = pd.to_datetime(albums['end_time']).dt.to_period('M')
albums['ms_played'] = albums['ms_played'] / 60000
top20albums = albums.groupby('album')['ms_played'].sum().sort_values(ascending=False).nlargest(20)
albums = albums[albums['album'].isin(top20albums.index)]

plotly_albums_df = albums.groupby(['album','track_name','artist_name','month_year'])['ms_played'].sum().reset_index()
plotly_albums_df['month_year'] = plotly_albums_df['month_year'].astype(str)
plotly_albums_df['ms_played'] = plotly_albums_df['ms_played'].round()

months_order = ['2019-06', '2019-07', '2019-08','2019-09','2019-10','2019-11','2019-12',
                '2020-01', '2020-02', '2020-03', '2020-04','2020-05', '2020-06']
colors = ['rgb(254,240,217)','rgb(254,240,217)','rgb(253,212,158)','rgb(253,212,158)',
          'rgb(253,187,132)','rgb(253,187,132)','rgb(252,141,89)','rgb(252,141,89)',
          'rgb(227,74,51)','rgb(227,74,51)','rgb(179,0,0)','rgb(179,0,0)','rgb(110,0,0)']
labels={"month_year": "Month",  "track_name": "Song", "ms_played": "Minutes Listening",'artist_name': 'Artist','album':'Album'}

fig = px.bar(plotly_albums_df, 
             x='album',
             y='ms_played',
             hover_data=['album','track_name','artist_name','month_year','ms_played'],
             opacity=.8,
             title = '(Last 12 Months)',
             color='month_year',
             labels=labels,
             category_orders={"month_year": months_order},
             color_discrete_sequence=colors
            )
fig.update_traces(marker_line_width=0,marker_line_color='black')
fig.show()

Clustering Songs with the K-Mean Algorithim

Data preprocessing

Filter for tracks listened to for 15+ min in last year

In [144]:
minplayed = (my_streaming.groupby('track_id')['ms_played'].sum()/60000).reset_index()
listens15 = minplayed[minplayed.ms_played > 13]
song_prefs = listens15.merge(my_features,how="left",left_on= "track_id", right_on="track_id").drop_duplicates()

Create all-numeric numpy array, X

In [145]:
song_prefs = song_prefs[['track_id','track_name','artist_name','album','energy', 'tempo','speechiness', 'acousticness', 'instrumentalness', 'danceability', 'loudness', 'valence']].dropna()
X = song_prefs.drop(['track_id','track_name','artist_name','album'],axis=1)

Scale numeric columns not between 0 and 1

In [147]:
scaler = MinMaxScaler()
X.loudness = scaler.fit_transform(X.loudness.values.reshape(-1,1))
X.tempo = scaler.fit_transform(X.tempo.values.reshape(-1,1))

Data analysis

Correlations b/n audio features

In [148]:
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(X.corr(),cmap=cmap)
Out[148]:
<matplotlib.axes._subplots.AxesSubplot at 0x1367f66d8>

Audio features distributions

In [149]:
distplot_data = [list(X[feature]) for feature in X.columns]
group_labels = list(X.columns)

fig = ff.create_distplot(distplot_data,
                         group_labels,
                         bin_size=.02,
                         show_hist=False
                        )
fig.update_layout({"template": 'plotly_white'})
fig.update_yaxes(range=[0, 7])
fig.show()

Choosing Number of Mood Clusters for K-Means

In [150]:
inertia = {}
for n in range(1,15):
    kmeans = KMeans(n_clusters=n, random_state=1,n_jobs=-1).fit(X.values)
    inertia[n] = kmeans.inertia_
cluster_num = list(inertia.keys())
inertia_vals = list(inertia.values())
In [151]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=cluster_num, y=inertia_vals,
                    mode='lines+markers'))
elbow = [dict(type="circle",
                            xref="x", yref="y",
                            x0=cluster_num[3]-.4, y0=inertia_vals[3]-5,
                            x1 = cluster_num[3]+.4, y1 = inertia_vals[3]+5,
                            line=dict(color="Red"))]

fig.update_layout(xaxis_title="Number of Clusters",
    yaxis_title="Inertia",
    updatemenus=[
        dict(
            type="buttons",
            buttons=[
                dict(label="None",
                     method="relayout",
                     args=["shapes", []]),
                dict(label="Elbow",
                     method="relayout",
                     args=["shapes", elbow])
            ])])
config = {'displayModeBar': False}
fig.show(config=config)

K-Means

In [152]:
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters,n_jobs=-1,random_state=1).fit(X.values)
y_kmeans = kmeans.predict(X.values)

PCA - 2D

In [153]:
pca_2D = PCA(n_components=2)
principal_components_2D = pca_2D.fit_transform(X.values)
pc2D = pd.DataFrame(principal_components_2D)

pc2D['label'] = [str(y) for y in y_kmeans]
pc2D.columns = ['x', 'y','label']

fig = px.scatter(pc2D, 
                 x='x', 
                 y='y',
                 color='label',
                 color_discrete_map={'0':'purple','1':'blue','2':'green','3':'red'},
                 category_orders={"label": ["0", "1", "2", "3"]})
In [154]:
print (pca_2D.explained_variance_ratio_, sum(pca_2D.explained_variance_ratio_))
[0.32153694 0.21088925] 0.5324261895032345

PCA - 3D

In [156]:
from mpl_toolkits.mplot3d import Axes3D

pca_3D = PCA(n_components=3)
principal_components_3D = pca_3D.fit_transform(X.values)
pc3D = pd.DataFrame(principal_components_3D)

pc3D['label'] = [str(y) for y in y_kmeans]
pc3D.columns = ['x', 'y', 'z', 'label']

fig = px.scatter_3d(pc3D, 
                 x='x', 
                 y='y',
                 z='z',
                 color='label',
                 color_discrete_map={'0':'purple','1':'blue','2':'green','3':'red'},
                 category_orders={"label": ["0", "1", "2", "3"]})
In [157]:
print (pca_3D.explained_variance_ratio_, sum(pca_3D.explained_variance_ratio_))
[0.32153694 0.21088925 0.16636669] 0.6987928758094093

Defining Moods for each K-Means Cluster

Number of songs in each cluster

In [158]:
song_prefs['label'] = y_kmeans
# shuffle dataset
fig = sns.barplot(x=song_prefs['label'].value_counts().index, 
                  y=song_prefs['label'].value_counts()
                 )

plt.title('# of Songs in each Group')
plt.ylabel('')
fig = fig.get_figure()
fig.set_size_inches(10, 4)
fig.show()
/Users/veeralshah/anaconda3/lib/python3.7/site-packages/matplotlib/figure.py:445: UserWarning:

Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.

Scale all audio features to have [mean=0 & variance=1] for intuitive feature comparison

In [160]:
scaler = StandardScaler()
sns.set(font_scale=1.6,font='Times New Roman')
fig = sns.heatmap(scaler.fit_transform(song_prefs.groupby('label').mean()).T,
                  cmap='coolwarm',
                  yticklabels=[x.capitalize() for x in list(X.columns)],
                  annot=True)
fig = fig.get_figure()
fig.set_size_inches(16, 8)

Using a Random Forest Classifier to Analyze Cluster Accuracy

Create training and test sets from data

In [165]:
scaler = StandardScaler()
Xtrain, Xtest, ytrain, ytest = train_test_split(scaler.fit_transform(X.values),y_kmeans,test_size =.25,random_state=1)

Run Random Forest Classifier

In [166]:
clf = RandomForestClassifier(n_estimators=30, random_state=10,criterion='entropy')
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

Audio Feature Importance

In [167]:
print ([(x,y) for x,y in zip(list(np.round(clf.feature_importances_,2)),group_labels)])
[(0.08, 'energy'), (0.23, 'tempo'), (0.04, 'speechiness'), (0.19, 'acousticness'), (0.02, 'instrumentalness'), (0.06, 'danceability'), (0.08, 'loudness'), (0.3, 'valence')]

Classification results

In [168]:
moods = ['Hype','Angsty','Happy',"Sad"]
classification_matrix = np.zeros((4,4))
for x,y in zip(ytest,ypred):
    classification_matrix[x,y]+=1
ax = sns.heatmap(classification_matrix,
                 cmap='Blues',
                 cbar=False,
                 annot=True,
                 xticklabels = moods,
                 yticklabels= moods)
ax.set(xlabel='Preds', ylabel='True')
plt.show()

Creating Mood-Based Spotify Playlists

Authorize access to User playlists

In [169]:
from spotipy.oauth2 import SpotifyOAuth

scope = 'playlist-modify-public'
token = util.prompt_for_user_token(username=username, 
                                   scope=scope, 
                                   client_id=client_id,   
                                   client_secret=client_secret,     
                                   redirect_uri=redirect_uri)

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id,client_secret,redirect_uri,scope=scope,username=username))

Create dataset of songs based on minutes listened

In [172]:
minplayed = (my_streaming.groupby('track_id')['ms_played'].sum()/60000).reset_index()
minutes_bar = 10
listens_mins = minplayed[minplayed.ms_played > minutes_bar]
In [ ]:
song_prefs = listens_mins.merge(my_features,
                                how="left",
                                left_on= "track_id", 
                                right_on="track_id").drop_duplicates().reset_index(drop=True)
In [181]:
minplayed = (my_streaming.groupby('track_id')['ms_played'].sum()/60000).reset_index()
minutes_bar = 10
listens_mins = minplayed[minplayed.ms_played > minutes_bar]

song_prefs = listens_mins.merge(my_features,
                                how="left",
                                left_on= "track_id", 
                                right_on="track_id").drop_duplicates().reset_index(drop=True)
song_prefs = song_prefs[['track_id','track_name','artist_name','album','energy', 'tempo','speechiness', 'acousticness', 'instrumentalness', 'danceability', 'loudness', 'valence']].dropna()

X = song_prefs.drop(['track_id','track_name','artist_name','album'],axis=1)
scaler = MinMaxScaler()
X.loudness = scaler.fit_transform(X.loudness.values.reshape(-1,1))
X.tempo = scaler.fit_transform(X.tempo.values.reshape(-1,1))

Fit K-Means algorithim to new data

In [182]:
y_kmeans = kmeans.predict(X.values)
song_prefs['label'] = y_kmeans

Create new mood-specific playlists directly in Spotify

In [186]:
def create_mood_playlists(moods, df, num_clusters, playlist_length):
    for moodnum in range(num_clusters):
        data = df[df.label==moodnum]
        sp.user_playlist_create(username, moods[moodnum])
        playlist_id = sp.user_playlists(username)['items'][0]['id']
        playlist_song_IDs = list(data['track_id'].sample(playlist_length))
        sp.user_playlist_add_tracks(username, playlist_id, list(playlist_song_IDs))
        
In [ ]:
moods = ['Sad','Happy','Angsty',"Hype"]
num_clusters = 4
playlist_length = 20

create_mood_playlists(moods, song_prefs, num_clusters, playlist_length)